
import logging
from init import PATHS
LOGGER = logging.getLogger(__name__)
import os
import pandas as pd
import numpy as np
from C_PatentVariables import firm_year_aggregation


def main():
    LOGGER.info('SCRIPT: f_overlap_and_OtherFirms_in_ipc_cpc_transpo.py')
    df = get_all_families_and_where_they_come_from()
    df = add_psnsector(df)
    df.to_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_overlap.csv', index=False)
    exploring(df)
    Output_panel_firms_of_cpcipctranspo()
    explore_naics_firms_of_cpcipctranspo()
    Output_Families_Bat_FC_naics_info()
    LOGGER.info('SCRIPT END: f_overlap_and_OtherFirms_in_ipc_cpc_transpo.py')


def get_all_families_and_where_they_come_from():
    df = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_OEMs.csv', usecols=['docdb_family_id'])
    df['OEM'] = True
    df1 = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_subsidiaries.csv', usecols=['docdb_family_id'])
    df1['Subsidiaries'] = True
    df = df.merge(df1, on='docdb_family_id', how='outer')
    df1 = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_suppliers.csv', usecols=['docdb_family_id'])
    df1['Suppliers'] = True
    df = df.merge(df1, on='docdb_family_id', how='outer')
    df1 = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_ipc_cpc_transpo.csv', usecols=['docdb_family_id'])
    df1['CPCIPCTranspo'] = True
    df = df.merge(df1, on='docdb_family_id', how='outer')
    df1 = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_nace_motor.csv', usecols=['docdb_family_id'])
    df1['NaceMotor'] = True
    df = df.merge(df1, on='docdb_family_id', how='outer')
    df = df.drop_duplicates()
    df['docdb_family_id'].nunique()   # 10 818 489
    df = df.fillna(False)
    return df


def add_psnsector(df):
    colnames = ['docdb_family_id', 'psn_sector']
    df_fam = []
    df_fam.append(pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_ipc_cpc_transpo_psnsector.csv', usecols=colnames))
    df_fam.append(pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_OEMs_psnsector.csv', usecols=colnames))
    df_fam.append(pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_subsidiaries_psnsector.csv', usecols=colnames))
    df_fam.append(pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_suppliers_psnsector.csv', usecols=colnames))
    df_fam.append(pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_nace_motor_psnsector.csv', usecols=colnames))
    df_fam = pd.concat(df_fam).drop_duplicates()
    df = df.merge(df_fam, on='docdb_family_id', how='left')
    df['psn_sector'] = df['psn_sector'].str.replace(' ', ',')
    df['psn_sector'] = df['psn_sector'].fillna('')
    list_type = sorted(list(set([i for k in df['psn_sector'].unique() for i in k.split(',') if i != ''])))
    for col in list_type:
        df[col] = df['psn_sector'].apply(lambda x: True if col in x else False)
    df['UNKNOWN'] = df['psn_sector'].apply(lambda x: True if x == 'UNKNOWN' else False)
    df['psn_sector_missing'] = df['psn_sector'].apply(lambda x: True if x == '' else False)
    # Drop duplicates
    df['length'] = df['psn_sector'].str.len()
    df = df.sort_values(by='length', ascending=False)
    df = df.drop_duplicates(subset='docdb_family_id', keep='first')
    df = df.drop(columns=['length', 'psn_sector'])
    return df


def print_stats(df):
    LOGGER.info('DIAGNOSTICS:')
    number = df[df['CPCIPCTranspo']]['docdb_family_id'].nunique()  # 5 064 083
    LOGGER.info(f'Number families CPCIPCTranspo: {number}')
    number = df[df['CPCIPCTranspo'] & (df['OEM'] | df['Subsidiaries'] | df['Suppliers'])]['docdb_family_id'].nunique()/df[df['CPCIPCTranspo']]['docdb_family_id'].nunique()
    LOGGER.info(f'Percentage of families CPCIPCTranspo that are OEMS, subsi or suppliers: {number}')
    number = df[df['CPCIPCTranspo'] & df['OEM']]['docdb_family_id'].nunique()  # 285 168
    LOGGER.info(f'Number families CPCIPCTranspo & OEMs: {number}')
    number = df[df['CPCIPCTranspo'] & df['OEM']]['docdb_family_id'].nunique()/df[df['CPCIPCTranspo']]['docdb_family_id'].nunique()  # 5%
    LOGGER.info(f'Percentage of families CPCIPCTranspo that are OEMs: {number}')
    number = df[df['CPCIPCTranspo'] & (df['OEM'] | df['Subsidiaries'])]['docdb_family_id'].nunique()   # 326890
    LOGGER.info(f'Number families CPCIPCTranspo & OEMs or Subsidiaries: {number}')
    number = df[df['CPCIPCTranspo'] & (df['OEM'] | df['Subsidiaries'])]['docdb_family_id'].nunique()/df[df['CPCIPCTranspo']]['docdb_family_id'].nunique()     # 6.5%
    LOGGER.info(f'Percentage of families CPCIPCTranspo that are OEMs or subsidiaries: {number}')
    number = df[df['CPCIPCTranspo'] & df['Suppliers']]['docdb_family_id'].nunique()  # 467679
    LOGGER.info(f'Number families CPCIPCTranspo & Suppliers: {number}')
    number = df[df['CPCIPCTranspo'] & df['Suppliers']]['docdb_family_id'].nunique()/df[df['CPCIPCTranspo']]['docdb_family_id'].nunique()  # 9%
    LOGGER.info(f'Percentage of families CPCIPCTranspo that are suppliers: {number}')
    number = df[df['OEM'] | df['Subsidiaries']]['docdb_family_id'].nunique()   # 1065699
    LOGGER.info(f'Number families OEMs or Subsidiaries: {number}')
    number = df[df['CPCIPCTranspo'] & (df['OEM'] | df['Subsidiaries'])]['docdb_family_id'].nunique()/df[df['OEM'] | df['Subsidiaries']]['docdb_family_id'].nunique()
    LOGGER.info(f'Percentage of families OEMS or subsi that are CPCIPCTranspo: {number}')
    number = df[df['Suppliers']]['docdb_family_id'].nunique()   # 5521275
    LOGGER.info(f'Number families Suppliers: {number}')
    number = df[df['CPCIPCTranspo'] & df['Suppliers']]['docdb_family_id'].nunique()/df[df['Suppliers']]['docdb_family_id'].nunique()
    LOGGER.info(f'Percentage of families suppliers that are CPCIPCTranspo: {number}')
    number = df[df['psn_sector_missing']]['docdb_family_id'].nunique() / df['docdb_family_id'].nunique()
    LOGGER.info(f'Percentage of families with psn_sector_missing: {number}')
    number = df[df['UNKNOWN']]['docdb_family_id'].nunique() / df['docdb_family_id'].nunique()    # 1%
    LOGGER.info(f'Percentage of families with psn_sector UNKNOWN: {number}')
    number = df[df['COMPANY']]['docdb_family_id'].nunique() / df['docdb_family_id'].nunique()    # 83.5%
    LOGGER.info(f'Percentage of families with psn_sector COMPANY: {number}')
    number = df[df['GOV']]['docdb_family_id'].nunique() / df['docdb_family_id'].nunique()   # 1.6%
    LOGGER.info(f'Percentage of families with psn_sector GOV: {number}')
    number = df[df['HOSPITAL']]['docdb_family_id'].nunique() / df['docdb_family_id'].nunique()      # 0%
    LOGGER.info(f'Percentage of families with psn_sector HOSPITAL: {number}')
    number = df[df['INDIVIDUAL']]['docdb_family_id'].nunique() / df['docdb_family_id'].nunique()      # 14.5%
    LOGGER.info(f'Percentage of families with psn_sector INDIVIDUAL: {number}')
    number = df[df['UNIVERSITY']]['docdb_family_id'].nunique() / df['docdb_family_id'].nunique()      # 2.9%
    LOGGER.info(f'Percentage of families with psn_sector UNIVERSITY: {number}')
    number = df[df['NON-PROFIT']]['docdb_family_id'].nunique() / df['docdb_family_id'].nunique()       # 1.6%
    LOGGER.info(f'Percentage of families with psn_sector NON-PROFIT: {number}')
    # CPCIPCTranspo outside of those connected to our firms
    mask = df['CPCIPCTranspo'] & ~(df['OEM'] | df['Subsidiaries'] | df['Suppliers'])
    number = df[mask & df['psn_sector_missing']]['docdb_family_id'].nunique() / df[mask]['docdb_family_id'].nunique()
    LOGGER.info(f'Percentage of families with psn_sector_missing among families in CPCIPCTranspo but not in OEMs/subsi/suppliers: {number}')
    number = df[mask & df['UNKNOWN']]['docdb_family_id'].nunique() / df[mask]['docdb_family_id'].nunique()
    LOGGER.info(f'Percentage of families with psn_sector UNKNOWN among families in CPCIPCTranspo but not in OEMs/subsi/suppliers: {number}')
    number = df[mask & df['COMPANY']]['docdb_family_id'].nunique() / df[mask]['docdb_family_id'].nunique()
    LOGGER.info(f'Percentage of families with psn_sector COMPANY among families in CPCIPCTranspo but not in OEMs/subsi/suppliers: {number}')
    number = df[mask & df['GOV']]['docdb_family_id'].nunique() / df[mask]['docdb_family_id'].nunique()
    LOGGER.info(f'Percentage of families with psn_sector GOV among families in CPCIPCTranspo but not in OEMs/subsi/suppliers: {number}')
    number = df[mask & df['HOSPITAL']]['docdb_family_id'].nunique() / df[mask]['docdb_family_id'].nunique()
    LOGGER.info(f'Percentage of families with psn_sector HOSPITAL among families in CPCIPCTranspo but not in OEMs/subsi/suppliers: {number}')
    number = df[mask & df['INDIVIDUAL']]['docdb_family_id'].nunique() / df[mask]['docdb_family_id'].nunique()
    LOGGER.info(f'Percentage of families with psn_sector INDIVIDUAL among families in CPCIPCTranspo but not in OEMs/subsi/suppliers: {number}')
    number = df[mask & df['UNIVERSITY']]['docdb_family_id'].nunique() / df[mask]['docdb_family_id'].nunique()
    LOGGER.info(f'Percentage of families with psn_sector UNIVERSITY among families in CPCIPCTranspo but not in OEMs/subsi/suppliers: {number}')
    number = df[mask & df['NON-PROFIT']]['docdb_family_id'].nunique() / df[mask]['docdb_family_id'].nunique()
    LOGGER.info(f'Percentage of families with psn_sector NON-PROFIT among families in CPCIPCTranspo but not in OEMs/subsi/suppliers: {number}')


def add_year(df):
    colnames = ['docdb_family_id', 'earliest_filing_year']
    df_fam = []
    df_fam.append(pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_ipc_cpc_transpo.csv', usecols=colnames))
    df_fam.append(pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_OEMs.csv', usecols=colnames))
    df_fam.append(pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_subsidiaries.csv', usecols=colnames))
    df_fam.append(pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_suppliers.csv', usecols=colnames))
    df_fam.append(pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_nace_motor.csv', usecols=colnames))
    df_fam = pd.concat(df_fam).drop_duplicates()
    df = df.merge(df_fam, on='docdb_family_id', how='left')
    return df


def add_ifconnectedtobvdid(df):
    LOGGER.info('DIAGNOSTICS: Overlap between COMPANY in psn_sector and connection to bvdid')
    PatstatLinks = pd.read_csv(PATHS.privatedata / 'Patstat_orbis_correspondence/Orbis_PATSTAT_updatePEM_anonymized.csv', usecols=['docdb_family_id'])
    df = df.merge(PatstatLinks.drop_duplicates(), on='docdb_family_id', how='left', indicator=True)
    df = df.rename(columns={'_merge': 'bvdid'})
    df['bvdid'] = df['bvdid'].apply(lambda x: True if x == 'both' else False)
    ntotal = df['docdb_family_id'].nunique()
    LOGGER.info(f'Total number of families: {ntotal}')
    number = df[df['COMPANY']]['docdb_family_id'].nunique() / df['docdb_family_id'].nunique()    # 83.5%
    LOGGER.info(f'Percentage of families with psn_sector COMPANY: {round(100*number,2)}%')
    number = df[df['bvdid']]['docdb_family_id'].nunique() / df['docdb_family_id'].nunique()    # 90%
    LOGGER.info(f'Percentage of families associated with at least one bvdid: {round(100*number,2)}%')
    number = df[df['bvdid'] & (df['COMPANY'] == False)]['docdb_family_id'].nunique()
    LOGGER.info(f'Number of families associated to bvdids but not tagged as COMPANY by psn_sector: {number}')
    LOGGER.info(f'Percentage of families associated to bvdids but not tagged as COMPANY by psn_sector: {round(100*number/ntotal,2)}%')
    number = df[df['COMPANY'] & (df['bvdid'] == False)]['docdb_family_id'].nunique()
    LOGGER.info(f'Number of families tagged as COMPANY by psn_sector but not associated to bvdids: {number}')
    LOGGER.info(f'Percentage of families tagged as COMPANY by psn_sector but not associated to bvdids: {round(100*number/ntotal,2)}%')
    LOGGER.info(f'Families tagged as COMPANY by psn_sector but not associated to bvdids, by earliest_filing_year year')
    LOGGER.info(f'NB: First number printed is the number of families tagged as COMPANY but not associated to bvdids')
    LOGGER.info(f'NB: The percentage in parenthesis is relative to the number of families filed in year t tagged as COMPANY')
    for yeart in sorted(df['earliest_filing_year'].value_counts().index):
        maskyear = df['earliest_filing_year'] == yeart
        number = df[maskyear & df['COMPANY'] & (df['bvdid'] == False)]['docdb_family_id'].nunique()
        if number == 0:
            continue
        ntotalCOMP = df[df['COMPANY'] & maskyear]['docdb_family_id'].nunique()
        ntotal = df[maskyear]['docdb_family_id'].nunique()
        LOGGER.info(f'earliest_filing_year {yeart}: {number} ({round(100*number/ntotal, 2)}%)   [{ntotalCOMP} fam were tagged COMPANY that year (~ {round(100*ntotalCOMP/ntotal, 2)}% of families that year)]')
    return


def exploring(df):
    print_stats(df)
    df = add_year(df)
    add_ifconnectedtobvdid(df)


def collect_naics(df_panel):
    # Collect naics
    LOGGER.info('collect_naics of Firms of Families of ipc cpc transpo')
    list_countries = df_panel['bvdid'].str[:2].unique().tolist()
    coltoimport = ['BvD ID number', 'NAICS, Primary code(s)', 'NAICS, Primary code(s), text description']
    data = []
    for co in list_countries:
        for indusclass in pd.read_csv(PATHS.orbis / f'2018/Industry_classifications/by_country/csv/Industry_classifications-{co}.csv', sep='\t', usecols=coltoimport, chunksize=1000000):
            indusclass = indusclass[indusclass['NAICS, Primary code(s)'].notnull()]
            indusclass = indusclass.rename(columns={'BvD ID number': 'bvdid'})
            indusclass = indusclass[indusclass['bvdid'].isin(df_panel['bvdid'].dropna().unique().tolist())]
            data.append(indusclass)
    data = pd.concat(data).drop_duplicates()
    data.to_csv(PATHS.dropbox / 'Data_outputted/temp/FirmsNaics_of_Families_of_ipc_cpc_transpo.csv', index=False)
    return


def collect_names(df_panel):
    # Collect names
    LOGGER.info('collect names of Firms of Families of ipc cpc transpo')
    list_countries = df_panel['bvdid'].str[:2].unique().tolist()
    data_names = []
    for co in list_countries:
        try:
            for names in pd.read_csv(PATHS.orbis / f'2019/Descriptive/BvD_ID_and_Name/by_country/csv/BvD_ID_and_Name-{co}.csv', sep='\t', chunksize=1000000):
                names = names.rename(columns={'BvD ID number': 'bvdid'})
                names = names[names['bvdid'].isin(df_panel['bvdid'].dropna().unique().tolist())]
                data_names.append(names)
        except Exception as exception:
            LOGGER.info(f'Error for country: {co}')
            LOGGER.info('Exception: {} / {} / {}'.format(type(exception), exception.args, exception))
    data_names = pd.concat(data_names).drop_duplicates()
    data_names.to_csv(PATHS.dropbox / 'Data_outputted/temp/FirmsNames_of_Families_of_ipc_cpc_transpo.csv', index=False)
    return


def Output_panel_firms_of_cpcipctranspo():
    LOGGER.info('Make panel of Firms of Families of ipc cpc transpo')
    # Get a df with bvdids, docdbids and year for all docdb ids in Families of ipc cpc transpo
    df_bvdid_docdbid = firm_year_aggregation.get_docdbids_of_bvdids('ipc_cpc_transpo')
    # Drop docdbids that are also associated to oems, subsi or suppliers.
    df = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_overlap.csv')
    mask = df['CPCIPCTranspo'] & ~(df['OEM'] | df['Subsidiaries'] | df['Suppliers'])
    df_bvdid_docdbid = df_bvdid_docdbid[df_bvdid_docdbid['docdb_family_id'].isin(df[mask]['docdb_family_id'])]
    df_agg_corres = df_bvdid_docdbid.copy(deep=True)
    df_agg_corres['Year'] = df_agg_corres['earliest_filing_year']
    # Collect names and naics from orbis - the output will be used in firm_year_aggregation.main()
    collect_names(df_bvdid_docdbid)
    collect_naics(df_bvdid_docdbid)
    firm_year_aggregation.main(df_agg_corres, df_bvdid_docdbid, aggregation_level='bvdid', namefile='OtherFirms_in_ipc_cpc_transpo')
    return


def explore_naics_firms_of_cpcipctranspo():
    LOGGER.info('explore_naics_firms_of_cpcipctranspo')
    df_firmyear = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Panel_OtherFirms_in_ipc_cpc_transpo_FamInfo.csv')
    # Print top patenters
    mask = df_firmyear['earliest_filing_year'].isin(range(1990, 2020))
    crosssection = df_firmyear[mask].groupby(['bvdid', 'name', 'NAICS, Primary code(s)'])['Count'].sum().sort_values()
    crosssection = crosssection.reset_index().rename(columns={'Count': 'Count19902019'})
    LOGGER.info(f'Top patenters in cpc ipc transpo that are not oems/subsi/supppliers \n Total counts of families filed between 1990-2019 \n {crosssection.iloc[-10:, :]}')
    crosssection['NAICS_4'] = crosssection['NAICS, Primary code(s)'].astype(str).str[:4]
    crosssection['NAICS_2'] = crosssection['NAICS, Primary code(s)'].astype(str).str[:2]
    crosssection = crosssection.sort_values(by='Count19902019', ascending=False).reset_index().drop(columns='index')
    crosssection['cumsum_percent'] = 100 * crosssection['Count19902019'].cumsum() / crosssection['Count19902019'].cumsum().iloc[-1]
    crosssection.to_csv(PATHS.dropbox / 'Data_outputted/temp/Crosssection_OtherFirms_in_ipc_cpc_transpo_naics.csv', index=False)
    LOGGER.info('Top NAICS code in cpc-ipc transpo firms that are not oems/subsi/suppliers')
    n = (100 * crosssection['NAICS, Primary code(s)'].value_counts() / crosssection.shape[0]).iloc[:10]
    LOGGER.info(f'\nTop 6-digit NAICS code (in percentage)\n{n}')
    n = (100 * crosssection['NAICS_4'].value_counts() / crosssection.shape[0]).iloc[:10]
    LOGGER.info(f'\nTop 4-digit NAICS code (in percentage)\n{n}')
    n = (100 * crosssection['NAICS_2'].value_counts() / crosssection.shape[0]).iloc[:10]
    LOGGER.info(f'\nTop 2-digit NAICS code (in percentage)\n{n}')


def get_batfc_families_naics_info():
    # we need to add NAICS and subsector info to Families_overlap.csv (for Bat and FC) and years
    cols = ['docdb_family_id', 'Sub-sector', 'BatTrans']
    df_batfc_fam = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_ipc_cpc_transpo_cpc_ipc.csv', usecols=cols)
    # keep only if bat and fc in type
    df_batfc_fam = df_batfc_fam[df_batfc_fam['Sub-sector'].str.contains('batteries|fuel cells', regex=True)]
    df_batfc_fam = firm_year_aggregation.cleaning_subsectors(df_batfc_fam)
    # add bvdids
    PatstatLinks = pd.read_csv(PATHS.privatedata / 'Patstat_orbis_correspondence/Orbis_PATSTAT_updatePEM_anonymized.csv', usecols=['docdb_family_id', 'bvdid']).drop_duplicates()
    df_batfc_fam = df_batfc_fam.merge(PatstatLinks, on='docdb_family_id', how='left')
    data_naics = get_naics_info_and_clean_to_have_a_df_at_uniquebvdid_level()
    df_batfc_fam = df_batfc_fam.merge(data_naics, on='bvdid', how='left')
    df_batfc_fam = df_batfc_fam.sort_values(by=['docdb_family_id', 'bvdid'])
    # Add if OEMS or subsidiaries
    OEMsandSubsi = pd.read_csv(PATHS.dropbox / 'Data_outputted/A_AutoIndustry/OEM_and_Subsidiaries.csv', usecols=['Level1_bvdid', 'Level2_bvdid', 'Sub_BvDID'])
    OEMsandSubsi = list(pd.concat([OEMsandSubsi['Level1_bvdid'].drop_duplicates(), OEMsandSubsi['Level2_bvdid'].drop_duplicates(), OEMsandSubsi['Sub_BvDID'].drop_duplicates()], axis=0).drop_duplicates())
    df_batfc_fam['OEMorSubsidiary'] = df_batfc_fam['bvdid'].notnull() & df_batfc_fam['bvdid'].isin(OEMsandSubsi)
    mask = df_batfc_fam['bvdid'].isnull()
    df_batfc_fam.loc[df_batfc_fam[mask].index, 'OEMorSubsidiary'] = np.nan
    df_batfc_fam['OEMorSubsidiary'] = df_batfc_fam['OEMorSubsidiary'].replace(True, 1)
    df_batfc_fam['OEMorSubsidiary'] = df_batfc_fam['OEMorSubsidiary'].replace(False, 0)
    return df_batfc_fam


def get_naics_info_and_clean_to_have_a_df_at_uniquebvdid_level():
    # we want one bvdid per row - no duplicates on bvdids
    data_naics = pd.read_csv(PATHS.dropbox / 'Data_outputted/temp/FirmsNaics_of_Families_of_ipc_cpc_transpo.csv')
    # NB: this csv only contains the NAICS of the bvdids that were neither OEMs, subsi or suppliers
    # NB2: some bvdids can have several naics code
    data_naics_suppliers = pd.read_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/suppliers_naics.csv')
    data_naics_suppliers = data_naics_suppliers.rename(columns={'BvD ID number': 'bvdid'})
    data_naics_suppliers = data_naics_suppliers.rename(columns={'Title': 'NAICS, Primary code(s), text description'})
    data_naics_subsi = pd.read_csv(PATHS.dropbox / 'Data_outputted/A_AutoIndustry/subsidiaries_industry.csv')
    data_naics_subsi = data_naics_subsi.rename(columns={'BvD ID number': 'bvdid'})
    data_naics_subsi = data_naics_subsi[['bvdid', 'NAICS, Primary code(s)', 'NAICS, Primary code(s), text description']]
    data_naics_oems = pd.read_csv(PATHS.dropbox / 'Data_outputted/A_AutoIndustry/NAICS336111_primarysecondaryNAICS.csv')
    data_naics_oems = data_naics_oems.rename(columns={'BvD ID number': 'bvdid'})
    data_naics_oems = data_naics_oems.rename(columns={'NAICS 2017, primary code(s)': 'NAICS, Primary code(s)'})
    data_naics_oems = data_naics_oems[['bvdid', 'NAICS, Primary code(s)']].drop_duplicates()
    data_naics = pd.concat([data_naics, data_naics_suppliers, data_naics_subsi, data_naics_oems], axis=0)
    data_naics = data_naics[data_naics['NAICS, Primary code(s)'].notnull()].drop_duplicates()
    data_naics = data_naics[['bvdid', 'NAICS, Primary code(s)']].drop_duplicates().sort_values(by=['bvdid', 'NAICS, Primary code(s)'])
    # Clean NAICS
    data_naics['naics_short4'] = data_naics['NAICS, Primary code(s)'].apply(lambda x: str(int(x)) + '000000')
    data_naics['naics_short4'] = data_naics['naics_short4'].apply(lambda x: x[:4])
    data_naics['naics_short3'] = data_naics['naics_short4'].apply(lambda x: x[:3])
    data_naics['naics_short2'] = data_naics['naics_short4'].apply(lambda x: x[:2])
    data_naics = data_naics[['bvdid', 'naics_short4', 'naics_short3', 'naics_short2']].drop_duplicates()
    data_naics['MotorVehicle'] = data_naics['naics_short4'].apply(lambda x: True if x in ['3361', '3362', '3363'] else False)
    data_naics['OtherTransport'] = data_naics.apply(lambda row: True if (row['naics_short3'] == '336') & (row['naics_short4'] not in ['3361', '3362', '3363']) else False, axis=1)
    data_naics['Electronics'] = data_naics['naics_short3'].apply(lambda x: True if x in ['334', '335'] else False)
    data_naics['MachineryChemical'] = data_naics['naics_short3'].apply(lambda x: True if x in ['333', '325'] else False)
    data_naics['OtherManufacturing'] = data_naics.apply(lambda row: True if (row['naics_short2'] in ['31', '32', '33']) & (row['naics_short3'] not in ['336', '334', '335', '333', '325']) else False, axis=1)
    data_naics['EducationRD'] = data_naics['naics_short3'].apply(lambda x: True if x in ['611', '541'] else False)
    data_naics['Utilities'] = data_naics['naics_short3'].apply(lambda x: True if x == '211' else False)
    data_naics['OtherSector'] = data_naics.apply(lambda row: True if (row['naics_short2'] not in ['31', '32', '33']) & (row['naics_short3'] not in ['611', '541', '211']) else False, axis=1)
    d1 = data_naics.groupby('bvdid')['naics_short4'].nunique().rename('Nbr_NAICS_4digit')
    cols = ['bvdid', 'MotorVehicle', 'OtherTransport', 'OtherManufacturing', 'Electronics', 'MachineryChemical', 'EducationRD', 'Utilities', 'OtherSector']
    d2 = data_naics[cols].drop_duplicates().groupby('bvdid').sum()
    data_naics = pd.concat([d1, d2], axis=1).reset_index()
    return data_naics



def Output_Families_Bat_FC_naics_info():
    df_batfc_fam = get_batfc_families_naics_info()
    # add years
    df_batfc_fam_years = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_of_ipc_cpc_transpo.csv', usecols=['docdb_family_id', 'earliest_filing_year'])
    df_batfc_fam = df_batfc_fam_years.merge(df_batfc_fam, on='docdb_family_id', how='right')
    df_batfc_fam.to_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Families_Bat_FC_naics_info.csv', index=False)
    return

